{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = 'all'  # default is ‘last_expr'\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('/home/mink/notebooks/CameraTraps')  # append this repo to PYTHONPATH\n",
    "sys.path.append('/home/mink/lib/ai4eutils')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from collections import Counter, defaultdict\n",
    "from random import sample\n",
    "from shutil import copyfile\n",
    "from multiprocessing.pool import ThreadPool\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "import path_utils  # ai4eutils\n",
    "\n",
    "from data_management.megadb.schema import sequences_schema_check\n",
    "from data_management.megadb.converters.cct_to_megadb import process_sequences"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# amapa_1819"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = 'amapa_1819'\n",
    "\n",
    "container_root = '/mink_disk_0/camtraps/amapa_1819/'  \n",
    "\n",
    "path_to_output = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}.json' \n",
    "path_to_output_temp = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}_temp.json' "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 0 - Add an entry to the `datasets` table\n",
    "\n",
    "Done"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1 - Prepare the `sequence` objects to insert into the database\n",
    "\n",
    "The labels are very neat, contained in four CSVs. \n",
    "- Some entries in the CSVs are not present in blob (probably the people images).\n",
    "- The \"RelativePath\" column is the location. \n",
    "- Sequence info can be extracted from the file names\n",
    "- There isn't a good identifier for each sequence, but the timestamp up to the minute is an okay divider between sequences. It seems that the camera would start numbering from (1) again in a new minute.\n",
    "\n",
    "Images were AzCopied to the data disk first."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "24221"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "19248"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files_list = path_utils.recursive_file_list(container_root, convert_slashes=False)\n",
    "len(files_list)\n",
    "\n",
    "images_set = set([i.split(container_root)[1] for i in files_list if path_utils.is_image_file(i)])\n",
    "len(images_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Cameras_Praias_2019_Microsoft/2012P13/Site2012P13-11.09/VID_0469_11090469.JPG',\n",
       " 'Cameras_Praias_2019_Microsoft/2017P2/Site2017P2-10.27/VID_0029_EK000029.JPG',\n",
       " 'Cameras_Praias_2018/C2017P7_escondida/Site2017P7_escondida-10.19/VID_0748_10190002.JPG',\n",
       " 'Cameras_Praias_2018/CBF27/SiteBF27-11.03/VID_0505_11030305.JPG',\n",
       " 'Cameras_Praias_2018/C2012P032/Site2012P032-10.04/VID_0555_10040195.JPG',\n",
       " 'Cameras_Praias_2018/C2017P2/Site2017P2-11.09/VID_0026_EK000026.JPG',\n",
       " 'Cameras_Praias_2019_Microsoft/2012P16/Site2012P16-11.23/VID_2380_11230862.JPG',\n",
       " 'Cameras_Praias_2018/C2012P16/Site2012P16-10.07/VID_0552_10070552.JPG',\n",
       " 'Cameras_Praias_2018/C2012P181/Site2012P181-10.25/VID_0369_10250014.JPG',\n",
       " 'Cameras_Praias_2018/C2012P031/Site2012P031-11.02/VID_0398_11020130.JPG']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(images_set)[1000:1010]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the CSV lables from RSPB\n",
    "csv_paths = [\n",
    "    os.path.join(container_root, 'Cameras_Praias_2018/Cameras_Praias_2018_Microsoft/Timelapse_cameras_praias2018.csv'),\n",
    "    os.path.join(container_root, 'Cameras_Praias_2019_Microsoft/Timelapse_cameras_praias2019.csv')\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "14736\n",
      "11102\n"
     ]
    }
   ],
   "source": [
    "csv_dfs = []\n",
    "for p in csv_paths:\n",
    "    csv = pd.read_csv(p)\n",
    "    print(len(csv))\n",
    "    csv_dfs.append(csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25838, 26)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "(19221, 26)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_csv = pd.concat(csv_dfs, ignore_index=True)\n",
    "all_csv.shape\n",
    "\n",
    "# filter to non-video files\n",
    "all_csv = all_csv[all_csv['video'] == False]\n",
    "all_csv.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['File', 'RelativePath', 'Folder', 'Date', 'Time', 'ImageQuality',\n",
       "       'DeleteFlag', 'video', 'analyst', 'mammal', 'bird', 'reptile', 'people',\n",
       "       'boat', 'individuals', 'youngpresent', 'species1', 'species2',\n",
       "       'behavior1', 'behavior2', 'behavior3', 'behaviorobs', 'observation',\n",
       "       'publicity', 'completed', 'Unnamed: 25'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_csv.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>File</th>\n",
       "      <th>RelativePath</th>\n",
       "      <th>Folder</th>\n",
       "      <th>Date</th>\n",
       "      <th>Time</th>\n",
       "      <th>ImageQuality</th>\n",
       "      <th>DeleteFlag</th>\n",
       "      <th>video</th>\n",
       "      <th>analyst</th>\n",
       "      <th>mammal</th>\n",
       "      <th>...</th>\n",
       "      <th>species1</th>\n",
       "      <th>species2</th>\n",
       "      <th>behavior1</th>\n",
       "      <th>behavior2</th>\n",
       "      <th>behavior3</th>\n",
       "      <th>behaviorobs</th>\n",
       "      <th>observation</th>\n",
       "      <th>publicity</th>\n",
       "      <th>completed</th>\n",
       "      <th>Unnamed: 25</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>11844</th>\n",
       "      <td>VID_0054_10190054.JPG</td>\n",
       "      <td>C2017P4\\Site2017P4-10.19</td>\n",
       "      <td>Cameras_Praias_2018</td>\n",
       "      <td>19-Oct-2018</td>\n",
       "      <td>07:59:02</td>\n",
       "      <td>Ok</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>FMichalski</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>People</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21447</th>\n",
       "      <td>VID_3098_11240581.JPG</td>\n",
       "      <td>2012P16\\Site2012P16-11.24</td>\n",
       "      <td>Cameras_Praias_2019</td>\n",
       "      <td>24-Nov-2019</td>\n",
       "      <td>23:23:49</td>\n",
       "      <td>Ok</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>FMichalski</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>NI</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24275</th>\n",
       "      <td>VID_0189_11020189.JPG</td>\n",
       "      <td>2012P27\\Site2012P27-11.02</td>\n",
       "      <td>Cameras_Praias_2019</td>\n",
       "      <td>02-Nov-2019</td>\n",
       "      <td>14:59:13</td>\n",
       "      <td>Ok</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>FMichalski</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 26 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                        File               RelativePath               Folder  \\\n",
       "11844  VID_0054_10190054.JPG   C2017P4\\Site2017P4-10.19  Cameras_Praias_2018   \n",
       "21447  VID_3098_11240581.JPG  2012P16\\Site2012P16-11.24  Cameras_Praias_2019   \n",
       "24275  VID_0189_11020189.JPG  2012P27\\Site2012P27-11.02  Cameras_Praias_2019   \n",
       "\n",
       "              Date      Time ImageQuality  DeleteFlag  video     analyst  \\\n",
       "11844  19-Oct-2018  07:59:02           Ok       False  False  FMichalski   \n",
       "21447  24-Nov-2019  23:23:49           Ok       False  False  FMichalski   \n",
       "24275  02-Nov-2019  14:59:13           Ok       False  False  FMichalski   \n",
       "\n",
       "       mammal  ...  species1  species2 behavior1  behavior2  behavior3  \\\n",
       "11844   False  ...    People       NaN       NaN        NaN        NaN   \n",
       "21447   False  ...        NI       NaN       NaN        NaN        NaN   \n",
       "24275   False  ...       NaN       NaN       NaN        NaN        NaN   \n",
       "\n",
       "       behaviorobs observation publicity completed Unnamed: 25  \n",
       "11844          NaN         NaN     False      True         NaN  \n",
       "21447          NaN         NaN     False      True         NaN  \n",
       "24275          NaN         NaN     False      True         NaN  \n",
       "\n",
       "[3 rows x 26 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_csv.sample(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "numpy.bool_"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(all_csv.iloc[6630]['video'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "People\n",
      "nan\n",
      "Small lizard\n",
      "Crax alector\n",
      "Molothrus sp.\n",
      "Small bird\n",
      "Monasa atra\n",
      "Iguana iguana\n",
      "Hydrochoerus hydrochaeris\n",
      "Ameiva sp.\n",
      "Cuniculus paca\n",
      "Urubitinga urubitinga\n",
      "Tayassu pecari\n",
      "Leptotila sp.\n",
      "Mesembrinibis cayennensis\n",
      "Cathartes sp.\n",
      "Molothrus oryzivorus\n",
      "Podocnemis unifilis\n",
      "Psarocolius decumanus\n",
      "Frog\n",
      "Proechimys guyannensis\n",
      "Cathartes melambrotus\n",
      "Geotrygon sp.\n",
      "Leopardus wiedii\n",
      "Cochlearius cochlearius\n",
      "Leopardus pardalis\n",
      "Leptotila verreauxi\n",
      "Myrmecophaga tridactyla\n",
      "Insect\n",
      "Lontra longicaudis\n",
      "Ardea cocoi\n",
      "Anas platyrhynchos domesticus\n",
      "Canis lupus familiaris\n",
      "Didelphis marsupialis\n",
      "Coragyps atratus\n",
      "Tupinambis teguixin\n",
      "Sarcoramphus papa\n",
      "NI\n",
      "Crotophaga ani\n",
      "Eira barbara\n",
      "Calidris sp.\n",
      "Phalacrocorax brasilianus\n",
      "Pilherodius pileatus\n",
      "Mazama americana\n",
      "Panthera onca\n",
      "Puma concolor\n",
      "Cacicus cela\n",
      "Tigrisoma lineatum\n",
      "Tapirus terrestris\n",
      "Butorides striata\n",
      "Galictis vittata\n",
      "Speothos venaticus\n",
      "Dasyprocta leporina\n"
     ]
    }
   ],
   "source": [
    "for i in pd.unique(all_csv['species1']):\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nan\n",
      "Small bird\n",
      "Anas platyrhynchos domesticus\n",
      "People\n",
      "Ameiva sp.\n",
      "Coragyps atratus\n",
      "Canis lupus familiaris\n",
      "Frog\n"
     ]
    }
   ],
   "source": [
    "for i in pd.unique(all_csv['species2']):\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Finding the sequences\n",
    "The sequences seem to be indicated by when there is a video... The entire RelativePath seems to be a good proxy for a sequence, even though it may contain difference sequences at the same location (sometimes the camera field of vision is somewhat changed...)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "19221it [00:03, 5906.19it/s]\n"
     ]
    }
   ],
   "source": [
    "locations = set()\n",
    "embedded = []\n",
    "num_images = 0\n",
    "\n",
    "missing_images = []\n",
    "\n",
    "seq_id = 0\n",
    "num_last = 0\n",
    "frame_counter = 1\n",
    "\n",
    "for i_row, row in tqdm(all_csv.iterrows()):  # have to go in order for this to work\n",
    "    if not path_utils.is_image_file(row['File']):\n",
    "        continue\n",
    "    num_images += 1\n",
    "    frame_counter += 1\n",
    "    \n",
    "    # figuring out where a new sequence starts\n",
    "    # new sequence if a video file breaks the numbering of file names\n",
    "    fn = row['File']\n",
    "    num = int(fn.split('_')[1])\n",
    "    if num > num_last + 1 or num < num_last:  # could wrap around\n",
    "        # new sequence\n",
    "        seq_id += 1\n",
    "        frame_counter = 1\n",
    "    num_last = num\n",
    "    \n",
    "    rel_path = row['RelativePath'].replace('\\\\', '/')\n",
    "    location = rel_path.split('/')[0]\n",
    "    folder = row['Folder']\n",
    "    if folder == 'Cameras_Praias_2019':\n",
    "        folder = 'Cameras_Praias_2019_Microsoft'\n",
    "    im_path = os.path.join(folder, rel_path, fn)\n",
    "    \n",
    "    if im_path not in images_set:\n",
    "        missing_images.append(im_path)\n",
    "        continue\n",
    "    \n",
    "    datetime = row['Date'] + ' ' + row['Time']\n",
    "    \n",
    "    classes = []\n",
    "    if row['boat'] == True:\n",
    "        classes.append('boat')\n",
    "    \n",
    "    if isinstance(row['species1'], str) and row['species1'] != '':\n",
    "        classes.append(row['species1'].lower())\n",
    "        \n",
    "    if isinstance(row['species2'], str) and row['species2'] != '':\n",
    "        classes.append(row['species2'].lower())  \n",
    "    \n",
    "    if len(classes) == 0:\n",
    "        assert row['individuals'] == 0\n",
    "        classes.append('empty')\n",
    "    \n",
    "    embedded.append({\n",
    "        'file': im_path,\n",
    "        'class': classes,\n",
    "        'frame_num': frame_counter,\n",
    "        'datetime': datetime,\n",
    "        'count': row['individuals'],\n",
    "        'analyst': row['analyst'],\n",
    "        'observation': row['observation'] if isinstance(row['observation'], str) else None,\n",
    "\n",
    "        'location': location,\n",
    "        'seq_id': seq_id\n",
    "    })\n",
    "    locations.add(location)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "19221"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "5552"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_images\n",
    "seq_id # about 3.46 images per sequence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "42"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# number of locations\n",
    "len(locations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(missing_images)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The dataset_name is set to amapa_1819. Please make sure this is correct!\n",
      "Making a deep copy of docs...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 19221/19221 [00:00<00:00, 858696.46it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Putting 19221 images into sequences...\n",
      "Number of sequences: 5552\n",
      "Checking the location field...\n",
      "Checking which fields in a CCT image entry are sequence-level...\n",
      "\n",
      "all_img_properties\n",
      "{'frame_num', 'observation', 'analyst', 'count', 'file', 'class', 'location', 'datetime'}\n",
      "\n",
      "img_level_properties\n",
      "{'frame_num', 'observation', 'count', 'file', 'class', 'datetime'}\n",
      "\n",
      "image-level properties that really should be sequence-level\n",
      "{'location', 'analyst'}\n",
      "\n",
      "! Sequence-level property analyst with value FMichalski should be a dataset-level property. Removed from sequences.\n",
      "Finished processing sequences.\n",
      "Example sequence items:\n",
      "\n",
      "{\"dataset\": \"amapa_1819\", \"seq_id\": \"1\", \"location\": \"C2012P031\", \"images\": [{\"file\": \"Cameras_Praias_2018/C2012P031/Site2012P031-09.30/VID_0103_09300001.JPG\", \"class\": [\"people\"], \"frame_num\": 1, \"datetime\": \"30-Sep-2018 10:40:49\", \"count\": 1, \"observation\": null}, {\"file\": \"Cameras_Praias_2018/C2012P031/Site2012P031-09.30/VID_0104_09300002.JPG\", \"class\": [\"people\"], \"frame_num\": 2, \"datetime\": \"30-Sep-2018 10:40:50\", \"count\": 1, \"observation\": null}, {\"file\": \"Cameras_Praias_2018/C2012P031/Site2012P031-09.30/VID_0105_09300003.JPG\", \"class\": [\"empty\"], \"frame_num\": 3, \"datetime\": \"30-Sep-2018 10:40:51\", \"count\": 0, \"observation\": null}, {\"file\": \"Cameras_Praias_2018/C2012P031/Site2012P031-10.01/VID_0106_10010004.JPG\", \"class\": [\"small lizard\"], \"frame_num\": 4, \"datetime\": \"01-Oct-2018 10:35:53\", \"count\": 1, \"observation\": null}, {\"file\": \"Cameras_Praias_2018/C2012P031/Site2012P031-10.01/VID_0107_10010005.JPG\", \"class\": [\"empty\"], \"frame_num\": 5, \"datetime\": \"01-Oct-2018 10:35:54\", \"count\": 0, \"observation\": null}, {\"file\": \"Cameras_Praias_2018/C2012P031/Site2012P031-10.01/VID_0108_10010006.JPG\", \"class\": [\"small lizard\"], \"frame_num\": 6, \"datetime\": \"01-Oct-2018 10:35:54\", \"count\": 1, \"observation\": null}]}\n",
      "\n",
      "{\"dataset\": \"amapa_1819\", \"seq_id\": \"1040\", \"location\": \"C2012P091\", \"images\": [{\"file\": \"Cameras_Praias_2018/C2012P091/Site2012P091-11.10/VID_1623_11100173.JPG\", \"class\": [\"hydrochoerus hydrochaeris\"], \"frame_num\": 1, \"datetime\": \"10-Nov-2018 05:00:45\", \"count\": 1, \"observation\": null}, {\"file\": \"Cameras_Praias_2018/C2012P091/Site2012P091-11.10/VID_1624_11100174.JPG\", \"class\": [\"hydrochoerus hydrochaeris\"], \"frame_num\": 2, \"datetime\": \"10-Nov-2018 05:00:45\", \"count\": 1, \"observation\": null}, {\"file\": \"Cameras_Praias_2018/C2012P091/Site2012P091-11.10/VID_1625_11100175.JPG\", \"class\": [\"hydrochoerus hydrochaeris\"], \"frame_num\": 3, \"datetime\": \"10-Nov-2018 05:00:46\", \"count\": 1, \"observation\": null}]}\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "sequences = process_sequences(embedded, dataset_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[OrderedDict([('dataset', 'amapa_1819'),\n",
       "              ('seq_id', '2069'),\n",
       "              ('location', 'C2012P181'),\n",
       "              ('images',\n",
       "               [{'file': 'Cameras_Praias_2018/C2012P181/Site2012P181-12.04/VID_0719_12040141.JPG',\n",
       "                 'class': ['anas platyrhynchos domesticus'],\n",
       "                 'frame_num': 1,\n",
       "                 'datetime': '04-Dec-2018 13:53:49',\n",
       "                 'count': 1,\n",
       "                 'observation': None},\n",
       "                {'file': 'Cameras_Praias_2018/C2012P181/Site2012P181-12.04/VID_0720_12040142.JPG',\n",
       "                 'class': ['anas platyrhynchos domesticus'],\n",
       "                 'frame_num': 2,\n",
       "                 'datetime': '04-Dec-2018 13:53:50',\n",
       "                 'count': 1,\n",
       "                 'observation': None},\n",
       "                {'file': 'Cameras_Praias_2018/C2012P181/Site2012P181-12.04/VID_0721_12040143.JPG',\n",
       "                 'class': ['empty'],\n",
       "                 'frame_num': 3,\n",
       "                 'datetime': '04-Dec-2018 13:53:51',\n",
       "                 'count': 0,\n",
       "                 'observation': None}])]),\n",
       " OrderedDict([('dataset', 'amapa_1819'),\n",
       "              ('seq_id', '4701'),\n",
       "              ('location', '2012P16'),\n",
       "              ('images',\n",
       "               [{'file': 'Cameras_Praias_2019_Microsoft/2012P16/Site2012P16-11.23/VID_2187_11230669.JPG',\n",
       "                 'class': ['empty'],\n",
       "                 'frame_num': 1,\n",
       "                 'datetime': '23-Nov-2019 09:48:07',\n",
       "                 'count': 0,\n",
       "                 'observation': None},\n",
       "                {'file': 'Cameras_Praias_2019_Microsoft/2012P16/Site2012P16-11.23/VID_2188_11230670.JPG',\n",
       "                 'class': ['empty'],\n",
       "                 'frame_num': 2,\n",
       "                 'datetime': '23-Nov-2019 09:48:08',\n",
       "                 'count': 0,\n",
       "                 'observation': None},\n",
       "                {'file': 'Cameras_Praias_2019_Microsoft/2012P16/Site2012P16-11.23/VID_2189_11230671.JPG',\n",
       "                 'class': ['empty'],\n",
       "                 'frame_num': 3,\n",
       "                 'datetime': '23-Nov-2019 09:48:09',\n",
       "                 'count': 0,\n",
       "                 'observation': None}])]),\n",
       " OrderedDict([('dataset', 'amapa_1819'),\n",
       "              ('seq_id', '2201'),\n",
       "              ('location', 'C2012P242'),\n",
       "              ('images',\n",
       "               [{'file': 'Cameras_Praias_2018/C2012P242/Site2012P242-11.02/VID_0413_11020413.JPG',\n",
       "                 'class': ['cochlearius cochlearius'],\n",
       "                 'frame_num': 1,\n",
       "                 'datetime': '02-Nov-2018 18:47:16',\n",
       "                 'count': 1,\n",
       "                 'observation': None},\n",
       "                {'file': 'Cameras_Praias_2018/C2012P242/Site2012P242-11.02/VID_0414_11020414.JPG',\n",
       "                 'class': ['cochlearius cochlearius'],\n",
       "                 'frame_num': 2,\n",
       "                 'datetime': '02-Nov-2018 18:47:17',\n",
       "                 'count': 1,\n",
       "                 'observation': None},\n",
       "                {'file': 'Cameras_Praias_2018/C2012P242/Site2012P242-11.02/VID_0415_11020415.JPG',\n",
       "                 'class': ['cochlearius cochlearius'],\n",
       "                 'frame_num': 3,\n",
       "                 'datetime': '02-Nov-2018 18:47:18',\n",
       "                 'count': 1,\n",
       "                 'observation': None}])])]"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample(sequences, 3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2 - Pass the schema check\n",
    "\n",
    "Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.\n",
    "\n",
    "If the format conforms, the following messages will be printed:\n",
    "\n",
    "```\n",
    "Verified that the sequence items meet requirements not captured by the schema.\n",
    "Verified that the sequence items conform to the schema.\n",
    "```\n",
    "\n",
    "For large datasets, the second step will take some time (~ a minute). \n",
    "\n",
    "Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Verified that the sequence items meet requirements not captured by the schema.\n",
      "Verified that the sequence items conform to the schema.\n",
      "CPU times: user 1.64 s, sys: 0 ns, total: 1.64 s\n",
      "Wall time: 1.65 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "sequences_schema_check.sequences_schema_check(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path_to_output_temp, 'w', encoding='utf-8') as f:\n",
    "    json.dump(sequences, f, indent=1, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2b - copy images to flat folder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "def copy_file(src_path, dst_path):\n",
    "    return copyfile(src_path, dst_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5552/5552 [00:00<00:00, 6431.67it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 90.2 ms, sys: 41.9 ms, total: 132 ms\n",
      "Wall time: 866 ms\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "path_pairs = []\n",
    "for seq in tqdm(sequences):\n",
    "    seq_id = seq['seq_id']\n",
    "    for im in seq['images']:\n",
    "        \n",
    "        if 'empty' not in im['class']:\n",
    "            \n",
    "            src_path = os.path.join(container_root, im['file'])\n",
    "            assert os.path.exists(src_path), src_path\n",
    "            frame = im['frame_num']\n",
    "            dst_path = os.path.join('/mink_disk_0/camtraps/imerit12b', \n",
    "                                    f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')\n",
    "            path_pairs.append((src_path, dst_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7364"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "('/mink_disk_0/camtraps/amapa_1819/Cameras_Praias_2019_Microsoft/BF52/SiteBF52-12.06/VID_0155_EK000002.JPG',\n",
       " '/mink_disk_0/camtraps/imerit12b/amapa_1819.seq5513.frame155.jpg')"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(path_pairs)  # non-empty images out of total of 19221 (38%)\n",
    "path_pairs[-100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 8.59 s, sys: 21.5 s, total: 30 s\n",
      "Wall time: 1min 29s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "with ThreadPool(8) as pool:\n",
    "    dst_paths = pool.starmap(copy_file, path_pairs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7364"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dst_paths)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:cameratraps] *",
   "language": "python",
   "name": "conda-env-cameratraps-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
